# Data source: nusmods API at https://nusmods.com/api/.
# load bidding data
# calculate loading times
before <- Sys.time()
# read data directly from URL
myjson <- fromJSON(file = url("https://api.nusmods.com/corsBiddingStatsRaw.json"))
# create empty dataframe which will act as a container to be populated with data
myBid <- data.frame()
# flag the records that belong to semester 1 or 2; isTRUE() makes a record
# without a usable Semester field count as "not wanted" instead of raising
# an error, and vapply() copes with an empty download (1:length() would not)
in_regular_sem <- vapply(
  myjson,
  function(rec) isTRUE(rec$Semester %in% c(1, 2)),
  logical(1)
)
# append every kept record in ONE rbind() call; growing the data frame row by
# row copies the whole frame on each iteration. Putting the (empty) data frame
# first makes rbind() dispatch to its data.frame method, and leaves myBid as
# an empty data frame when no record is kept.
myBid <- do.call(rbind, c(list(myBid), myjson[in_regular_sem]))
# the raw records are no longer needed; release the memory
rm(myjson, in_regular_sem)
# calculate loading time
after <- Sys.time()
after - before
# save
saveRDS(myBid, file = "myBid.RDS")
# Output file: myBid.RDS
# create empty dataframe which will act as a container to be populated with data
myModInfo <- data.frame()
# records kept so far, across all years and semesters
kept_records <- list()
# looping through each year
for (year in 2011:2018) {
  for (semester in c(1, 2)) {
    # create the url where data is to be extracted from
    myurl <- paste0("https://api.nusmods.com/", year, "-", year + 1, "/", semester, "/moduleTimetableDeltaRaw.json")
    myjson <- fromJSON(file = url(myurl))
    # keep a record only when its module code starts with "PL" and its
    # Semester is 1 or 2; isTRUE() turns a missing field into "do not keep"
    # rather than an error, and vapply() copes with an empty download
    is_wanted <- vapply(
      myjson,
      function(rec) {
        isTRUE(str_detect(rec$ModuleCode, "^PL")) &&
          isTRUE(rec$Semester %in% c(1, 2))
      },
      logical(1)
    )
    kept_records <- c(kept_records, myjson[is_wanted])
    cat(year, "Semester", semester, "Done!")
  }
}
# append every kept record in ONE rbind() call instead of growing the data
# frame inside the loops (which copies the whole frame on each iteration);
# the data frame goes first so rbind() dispatches to its data.frame method
myModInfo <- do.call(rbind, c(list(myModInfo), kept_records))
# the raw records are no longer needed; release the memory
rm(myjson, is_wanted, kept_records)
# save
saveRDS(myModInfo, file = "myModInfo.RDS")
# Output file: myModInfo.RDS (holds the myModInfo data frame).
# Row filter 1: retain the Psychology modules, i.e. module codes starting
# with "PL". which() discards rows whose test is NA, as subset() does.
myModInfo <- myModInfo[
  which(str_detect(myModInfo$ModuleCode, "^PL")), ,
  drop = FALSE
]
# Row filter 2: discard the tutorial lessons.
myModInfo <- myModInfo[
  which(myModInfo$LessonType != "TUTORIAL"), ,
  drop = FALSE
]
# Column filter: retain the columns whose name matches one of these patterns.
myModInfo <- myModInfo[
  ,
  grepl("ModuleCode|DayText|StartTime|Semester|AcadYear", names(myModInfo))
]
# De-duplicate on ModuleCode, AcadYear, Semester, StartTime and DayText.
myModInfo <- distinct(
  myModInfo,
  ModuleCode,
  AcadYear,
  Semester,
  StartTime,
  DayText
)
# The following steps clean myBid.
# remove non-psychology modules
# only keep rows where module code begins with PL
myBid <- subset(myBid, str_detect(myBid$ModuleCode, "^PL"))
# also remove Roots and Wings (PLS8001) and psychology for non-psych students (PLB1201)
myBid <- subset(myBid, !str_detect(myBid$ModuleCode, "PLS|PLB"))
# remove the rounds where it was reserved
myBid <- subset(myBid, !str_detect(myBid$StudentAcctType, "Reserved"))
# remove information from bidding rounds involving [G] accounts
# (the brackets are escaped: an unescaped "[G]" is a character class that
# matches ANY capital G, not the literal text "[G]")
myBid <- subset(myBid, !str_detect(myBid$StudentAcctType, "\\[G\\]"))
# remove unneeded columns
# (a logical mask is used because a negative index built from grep() selects
# zero columns when no name matches; drop = FALSE keeps a data frame even if
# a single column remains)
myBid <- myBid[, !grepl("Group|Faculty", names(myBid)), drop = FALSE]
# myModInfo and myBid.
# NOTE(review): `mydata` is used from here on, but the step that creates it
# (presumably by combining myModInfo and myBid) is not visible here - confirm.
# transform these columns to numeric
# Columns are addressed by their exact name with [[ ]]: looking them up with
# grep() treats the name as a regular expression and can hit several columns.
# intersect() skips any name that is not present in mydata.
for (r in intersect(
  c("Quota", "Bidders", "LowestBid", "LowestSuccessfulBid", "HighestBid", "StartTime"),
  names(mydata)
)) {
  # go through as.character() so that a factor column is converted by its
  # labels; as.numeric() on a factor returns the internal level codes
  mydata[[r]] <- as.numeric(as.character(mydata[[r]]))
}
# transform these columns to factors
for (r in intersect(
  c("AcadYear", "Semester", "ModuleCode", "Round", "StudentAcctType", "DayText"),
  names(mydata)
)) {
  mydata[[r]] <- factor(mydata[[r]])
}
# create vector of the column names which are factors
facnames <- names(mydata)[vapply(mydata, is.factor, logical(1))]
# factor names without ModuleCode (StudentAcctType is kept)
# a logical mask is used instead of x[-grep(...)], which returns an EMPTY
# vector when nothing matches
facnames.mod <- facnames[!grepl("ModuleCode", facnames)]
# create vector of the column names which are numeric
numnames <- names(mydata)[vapply(mydata, is.numeric, logical(1))]
# numeric names without StartTime
numnames.time <- numnames[!grepl("StartTime", numnames)]
# DayText Levels
# Bidders is calculated across all academic years, all bidding rounds, all modules…
## ModuleCode AcadYear Semester Round Quota Bidders LowestBid LowestSuccessfulBid HighestBid StudentAcctType StartTime DayText
## PL1101E: 210 2013/2014:404 1:1109 1A:634 Min. : 1.00 Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0 New Students [P] : 314 Min. : 800 Monday :380
## PL3232 : 74 2015/2016:384 2:1097 1B:389 1st Qu.: 4.00 1st Qu.: 1.00 1st Qu.: 1.00 1st Qu.: 1.0 1st Qu.: 1.0 NUS Students [P] : 331 1st Qu.:1100 Tuesday :461
## PL3236 : 72 2014/2015:349 1C:244 Median : 16.00 Median : 3.00 Median : 1.00 Median : 1.0 Median : 301.0 Returning Students [P] :1191 Median :1300 Wednesday:558
## PL3235 : 71 2016/2017:272 2A:300 Mean : 26.87 Mean : 13.43 Mean : 69.75 Mean : 233.3 Mean : 700.4 Returning Students and New Students [P]: 370 Mean :1306 Thursday :493
## PL3234 : 70 2012/2013:263 2B:308 3rd Qu.: 32.00 3rd Qu.: 9.00 3rd Qu.: 5.00 3rd Qu.: 101.0 3rd Qu.:1173.8 3rd Qu.:1500 Friday :314
## PL3233 : 67 2011/2012:194 3A:173 Max. :430.00 Max. :440.00 Max. :2430.00 Max. :3459.0 Max. :4801.0 Max. :1900
## (Other):1642 (Other) :340 3B:158
# Bar chart of level counts for every categorical variable.
# ModuleCode is left out of this exploratory pass: it has too many levels (83).
for (fac in facnames.mod) {
  cat("Histogram Of ", fac, sep = "")
  fac_plot <- ggplot(data = mydata, aes_string(x = fac, fill = fac)) +
    geom_histogram(stat = "count") +
    ylab("Count") +
    ggtitle(paste0("Count of ", fac)) +
    theme_classic() +
    theme(
      legend.position = "none",
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 90, size = 6, vjust = -0.3)
    )
  plot(fac_plot)
}
## Histogram Of AcadYear
## Histogram Of Semester
## Histogram Of Round
## Histogram Of StudentAcctType
## Histogram Of DayText
# Histogram (90 bins) of every numeric variable.
for (num in numnames) {
  cat("Histogram Of ", num, sep = "")
  num_plot <- ggplot(data = mydata, aes_string(x = num, fill = num)) +
    geom_histogram(bins = 90, fill = "violetred") +
    ylab("Histogram") +
    ggtitle(paste0("Frequency of ", num)) +
    theme_classic() +
    theme(
      axis.title.x = element_text(),
      axis.text.x = element_text(angle = 90, size = 6, vjust = -0.3)
    )
  plot(num_plot)
}
## Histogram Of Quota
## Histogram Of Bidders
## Histogram Of LowestBid
## Histogram Of LowestSuccessfulBid
## Histogram Of HighestBid
## Histogram Of StartTime
# Heatmap of joint counts for every unordered pair of categorical variables
# in facnames.mod (each pair is plotted once).
for (r in seq_along(facnames.mod)) {
  for (i in seq_along(facnames.mod)) {
    # dont do anything if they are the same or the graph has been made before
    if (i <= r) {
      next
    }
    cat(paste0(facnames.mod[r], " ~ ", facnames.mod[i]))
    # create formula for xtabs
    tempform <- paste0("~ ", facnames.mod[r], " + ", facnames.mod[i])
    # temp is a dataframe that is only going to exist in this section
    # and overwritten with each loop
    # (as.formula() converts the string directly; no eval(parse()) needed)
    temp <- as.data.frame(xtabs(as.formula(tempform),
      data = mydata,
      subset = NULL
    ))
    plot(
      ggplot(data = temp, aes_string(x = facnames.mod[r], y = facnames.mod[i], fill = "Freq", label = "Freq")) +
        geom_tile() +
        geom_text() +
        scale_fill_gradient(low = "white", high = "violetred") +
        theme_minimal() +
        theme(
          axis.text.x = element_text(angle = 90, vjust = -0.3),
          legend.position = "none"
        )
    )
  }
}
## AcadYear ~ Semester
## AcadYear ~ Round
## AcadYear ~ StudentAcctType
## AcadYear ~ DayText
## Semester ~ Round
## Semester ~ StudentAcctType
## Semester ~ DayText
## Round ~ StudentAcctType
## Round ~ DayText
## StudentAcctType ~ DayText
# Scatter plot for every unordered pair of numeric variables in numnames,
# with the least-squares line (dashed) and the standardized regression
# coefficient printed as a label. numnames[r] is on the x axis and is the
# predictor; numnames[i] is on the y axis and is the outcome.
for (r in seq_along(numnames)) {
  for (i in seq_along(numnames)) {
    # dont do anything if they are the same or the graph has been made before
    if (i <= r) {
      next
    }
    cat(paste0(numnames[r], " ~ ", numnames[i]))
    # create formula for lm()
    tempform.std <- paste0("scale(", numnames[i], ")", " ~ ", "scale(", numnames[r], ")")
    tempform <- paste0(numnames[i], " ~ ", numnames[r])
    # regress to get best fit line
    # (as.formula() converts the strings directly; no eval(parse()) needed)
    # standardized
    stdreg <- lm(as.formula(tempform.std),
      data = mydata
    )
    # unstandardized
    reg <- lm(as.formula(tempform),
      data = mydata
    )
    plot(
      ggplot(data = mydata, aes_string(x = numnames[r], y = numnames[i])) +
        geom_point(color = "violetred", size = 2, alpha = 0.3) +
        theme_classic() +
        # coefficients[1] is the intercept, coefficients[2] the slope
        geom_abline(slope = reg$coefficients[2], intercept = reg$coefficients[1], lty = "dashed") +
        geom_label(aes(
          x = Inf, y = Inf, label = paste0(
            "Standardized Regression Coefficient = ",
            round(stdreg$coefficients[2], 3)
          ),
          hjust = 1, vjust = 2
        )) +
        theme(axis.text.x = element_text(angle = 90, vjust = -0.3))
    )
  }
}
## Quota ~ Bidders
## Quota ~ LowestBid
## Quota ~ LowestSuccessfulBid
## Quota ~ HighestBid
## Quota ~ StartTime
## Bidders ~ LowestBid
## Bidders ~ LowestSuccessfulBid
## Bidders ~ HighestBid
## Bidders ~ StartTime
## LowestBid ~ LowestSuccessfulBid
## LowestBid ~ HighestBid
## LowestBid ~ StartTime
## LowestSuccessfulBid ~ HighestBid
## LowestSuccessfulBid ~ StartTime
## HighestBid ~ StartTime
# Correlation matrix of the numeric variables listed in numnames.time.
# The columns are selected by name; building a regular expression from the
# names would also pick up any other column that merely contains one of them.
corrplot.mixed(cor(mydata[, numnames.time, drop = FALSE]),
  upper = "color",
  tl.pos = "lt",
  tl.cex = 0.5,
  cl.cex = 0.5
)
# Box plot of every numeric variable split by every categorical variable.
for (r in facnames.mod) {
  for (i in numnames) {
    cat(paste0(r, " ~ ", i))
    # graph
    plot(
      ggplot(data = mydata, aes_string(x = r, y = i, fill = r)) +
        geom_boxplot() +
        theme_classic() +
        theme(
          legend.position = "none",
          axis.text.x = element_text(angle = 90, vjust = -0.3)
        )
    )
  }
}
## AcadYear ~ Quota
## AcadYear ~ Bidders
## AcadYear ~ LowestBid
## AcadYear ~ LowestSuccessfulBid
## AcadYear ~ HighestBid
## AcadYear ~ StartTime
## Semester ~ Quota
## Semester ~ Bidders
## Semester ~ LowestBid
## Semester ~ LowestSuccessfulBid
## Semester ~ HighestBid
## Semester ~ StartTime
## Round ~ Quota
## Round ~ Bidders
## Round ~ LowestBid
## Round ~ LowestSuccessfulBid
## Round ~ HighestBid
## Round ~ StartTime
## StudentAcctType ~ Quota
## StudentAcctType ~ Bidders
## StudentAcctType ~ LowestBid
## StudentAcctType ~ LowestSuccessfulBid
## StudentAcctType ~ HighestBid
## StudentAcctType ~ StartTime
## DayText ~ Quota
## DayText ~ Bidders
## DayText ~ LowestBid
## DayText ~ LowestSuccessfulBid
## DayText ~ HighestBid
## DayText ~ StartTime
# Add a Level column derived from the module code: a code containing the
# digit 1, 2, 3 or 4 followed by three more digits is labelled "Level 1" to
# "Level 4" (lower levels are tested first); anything else is labelled
# "Graduate Module".
mydata$Level <- local({
  module_codes <- mydata$ModuleCode
  is_level_1 <- str_detect(module_codes, "1[0-9][0-9][0-9]")
  is_level_2 <- str_detect(module_codes, "2[0-9][0-9][0-9]")
  is_level_3 <- str_detect(module_codes, "3[0-9][0-9][0-9]")
  is_level_4 <- str_detect(module_codes, "4[0-9][0-9][0-9]")
  ifelse(
    is_level_1, "Level 1",
    ifelse(
      is_level_2, "Level 2",
      ifelse(
        is_level_3, "Level 3",
        ifelse(is_level_4, "Level 4", "Graduate Module")
      )
    )
  )
})
# Cross-tabulation for double-checking the result (kept disabled):
# xtabs( ~ ModuleCode + Level,
# data = mydata, subset = NULL)